rm(list=ls())

library(Hmisc)
library(mvnormtest)
library(lavaan)
library(semTools)
library(performance)
library(car)
library(ggeffects)
library(ggplot2)
library(MuMIn)



rev.score<-function(x) (max(x,na.rm=TRUE)+1)-x

#set appropriate working directory
#setwd("~/Documents")

dataf=data.frame(read.csv("StatsTaskValuesData.csv",header=T,na.strings=""))
names(dataf)
nrow(dataf)
#n=360

########################    Data Cleaning ##############################

#Make unique ID that accounts for each class section
#dataf<-dataf %>%
#  mutate(unique.section = paste(semester, school, sep = "-"))
#table(dataf$unique.section)
#Only 4 unique sections


#Reverse-score items
dataf$A2_2.rev<-rev.score(dataf$A2_2)
dataf$A2_5.rev<-rev.score(dataf$A2_5)


############################### Descriptive Stats for Items ##################

#make data set with just the value items
items<-dataf[,3:39]

#histograms of items
par(mfcol=c(7,6), oma=c(1,1,0,0), mar=c(1,1,1,0), tcl=-0.1, mgp=c(0,0,0))
hist.data.frame(items)
#some variables are skewed, some are not
#Definitely non-normality for some items
par(mfcol=c(1,1))

#Test for multivariate normality
items.mat<-as.matrix(items)
items.mat<-t(items.mat)
mshapiro.test(items.mat)
#p < 2.2 x 10^-16
#rejects the assumption of multivariate normality - must use a robust estimator in CFAs

#Item Means and SDs
colMeans(dataf[,3:39],na.rm=T)
sapply(dataf[,3:39],sd,na.rm=T)

#Item correlations - Appendix B, Table S1
options(max.print=999999)
cor(cbind(dataf[,3:39]))



################### Separate tests within constructs - following methods of Gaspard et al., 2015 ###############################


###########  Interest ###############################
#should only be one factor

int.model<-'Interest = ~I1_1 + I1_2 + I1_3 + I1_4'

fit.int<-cfa(int.model, data = dataf, estimator = "MLR")
summary(fit.int, fit.measures = TRUE, standardized = TRUE)

#Correlation residuals
resid(fit.int,type="cor")


##########  Attainment Value  ######################
#2 hypothesized factors according to Gaspard
#Gaspard correlated the residuals of the two negatively worded items

#Facets Model
attain.model.1<-'
Attain1 =~ A1_1 + A1_2 + A1_3 + A1_4
Attain2 =~ A2_1 + A2_2.rev + A2_3 + A2_4 + A2_5.rev + A2_6

#Correlate residuals of negatively worded items
A2_2.rev~~A2_5.rev'


#Single Construct
attain.model.2<-'
Attain =~ A1_1 + A1_2 + A1_3 + A1_4 + A2_1 + A2_2.rev + A2_3 + A2_4 + A2_5.rev + A2_6

#Correlate residuals of negatively worded items
A2_2.rev~~A2_5.rev'


fit.attain.1<-cfa(attain.model.1, data = dataf, estimator = "MLR")
summary(fit.attain.1, fit.measures = TRUE, standardized = TRUE)
resid(fit.attain.1,type="cor")


fit.attain.2<-cfa(attain.model.2, data = dataf, estimator = "MLR")
summary(fit.attain.2, fit.measures = TRUE, standardized = TRUE)



###############  Utility Value  ############################
#Only looking at U1 - U4

#Facets Model
uty.model.1<-'
Utility1 = ~U1_1 + U1_2
Utility2 = ~U2_1 + U2_2 + U2_3
Utility3 =~ U3_1 + U3_2 + U3_3
Utility4 = ~U4_1 + U4_2'

#Single Construct
uty.model.2<-'
Utility =~ U1_1 + U1_2 + U2_1 + U2_2 + U2_3 + U3_1 + U3_2 + U3_3 + U4_1 + U4_2'


fit.uty.1<-cfa(uty.model.1, data = dataf, estimator = "MLR")
summary(fit.uty.1, fit.measures = TRUE, standardized = TRUE)
resid(fit.uty.1,type="cor")

fit.uty.2<-cfa(uty.model.2, data = dataf, estimator = "MLR")
summary(fit.uty.2, fit.measures = TRUE, standardized = TRUE)



####################### Cost ###############################
#3 hypothesized facets

cost.model.1<-'
Cost1 = ~C1_1 + C1_2 + C1_3 + C1_4
Cost2 = ~C2_1 + C2_2 + C2_3 + C2_4
Cost3 =~ C3_1 + C3_2 + C3_3'

cost.model.2<-'
Cost = ~C1_1 + C1_2 + C1_3 + C1_4 + C2_1 + C2_2 + C2_3 + C2_4 + C3_1 + C3_2 + C3_3'


fit.cst.1<-cfa(cost.model.1, data = dataf, estimator = "MLR")
summary(fit.cst.1, fit.measures = TRUE, standardized = TRUE)
resid(fit.cst.1,type="cor")

fit.cst.2<-cfa(cost.model.2, data = dataf, estimator = "MLR")
summary(fit.cst.2, fit.measures = TRUE, standardized = TRUE)






############################   CFA on the ENTIRE Scale  ##########################################

facets.model<-'
Interest = ~I1_1 + I1_2 + I1_3 + I1_4
Attain1 =~ A1_1 + A1_2 + A1_3 + A1_4
Attain2 =~ A2_1 + A2_2.rev + A2_3 + A2_4 + A2_5.rev + A2_6
Utility1 = ~U1_1 + U1_2
Utility2 = ~U2_1 + U2_2 + U2_3
Utility3 = ~U3_1 + U3_2 + U3_3
Utility4 = ~U4_1 + U4_2
Cost1 = ~C1_1 + C1_2 + C1_3 + C1_4
Cost2 = ~C2_1 + C2_2 + C2_3 + C2_4
Cost3 = ~C3_1 + C3_2 + C3_3

#Correlate residuals of negatively worded items
A2_2.rev~~A2_5.rev'


values.model<-'
Interest = ~I1_1 + I1_2 + I1_3 + I1_4
Attain =~ A1_1 + A1_2 + A1_3 + A1_4 + A2_1 + A2_2.rev + A2_3 + A2_4 + A2_5.rev + A2_6
Utility = ~U1_1 + U1_2 + U2_1 + U2_2 + U2_3 + U3_1 + U3_2 + U3_3 + U4_1 + U4_2
Cost = ~C1_1 + C1_2 + C1_3 + C1_4 + C2_1 + C2_2 + C2_3 + C2_4 + C3_1 + C3_2 + C3_3

#Correlate residuals of negatively worded items
A2_2.rev~~A2_5.rev'


fit.facets<-cfa(facets.model, data = dataf, estimator = "MLR")
summary(fit.facets, fit.measures = TRUE, standardized = TRUE)
resid(fit.facets,type="cor")

fit.values<-cfa(values.model, data = dataf, estimator = "MLR")
summary(fit.values, fit.measures = TRUE, standardized = TRUE)






##############  Reliability on the measures   ############################

#omega
compRelSEM(fit.facets,tau.eq=FALSE,return.total=TRUE)




################################   Multiple Linear Regression (MLR) Analysis   ###########################################


##########  Data Cleaning for MLR  #########################

### Create a column for correct response to each of the 16 BioVEDA items from Hicks et al. (2020)
#Q5, Q7, Q14, and Q18 are not part of Hicks et al. final assessment - removed from analyses
#Binary column: 1 = correct; 0 = wrong
dataf$Q1.c<-ifelse(dataf$Q1==3,1,0)
dataf$Q2.c<-ifelse(dataf$Q2==3,1,0)
dataf$Q3.c<-ifelse(dataf$Q3MC==1,1,0)
dataf$Q4.c<-ifelse(dataf$Q4==2,1,0)
dataf$Q6.c<-ifelse(dataf$Q6==2,1,0)
dataf$Q8.c<-ifelse(dataf$Q8==1,1,0)
dataf$Q9.c<-ifelse(dataf$Q9==2,1,0)
dataf$Q10.c<-ifelse(dataf$Q10==1,1,0)
dataf$Q11.c<-ifelse(dataf$Q11==1,1,0)
dataf$Q12.c<-ifelse(dataf$Q12==1,1,0)
dataf$Q13.c<-ifelse(dataf$Q13==4,1,0)
dataf$Q15.c<-ifelse(dataf$Q15==4,1,0)
dataf$Q16.c<-ifelse(dataf$Q16MC==2,1,0)
dataf$Q17.c<-ifelse(dataf$Q17==4,1,0)
dataf$Q19.c<-ifelse(dataf$Q19==1,1,0)
dataf$Q20.c<-ifelse(dataf$Q20==3,1,0)

#Create a total BioVEDA score for each student by summing # of correct responses
dataf$BV<-rowSums(dataf[,c("Q1.c","Q2.c","Q3.c","Q4.c","Q6.c",
                           "Q8.c","Q9.c","Q10.c","Q11.c","Q12.c","Q13.c",
                           "Q15.c","Q16.c","Q17.c","Q19.c","Q20.c")])


#Create a total BioVEDA score for 14 items (short BioVEDA score)
#these are the items that all students did across all 4 semesters
#sum correct responses
dataf$BV.short<-rowSums(dataf[,c("Q1.c","Q2.c","Q4.c","Q6.c",
                                 "Q8.c","Q9.c","Q10.c","Q11.c","Q12.c","Q13.c",
                                 "Q15.c","Q17.c","Q19.c","Q20.c")])


##Are full BioVEDA scores (16 items) and short BioVEDA scores (14 items) correlated?
cor(dataf$BV,dataf$BV.short,use="complete.obs")
#cor=0.97
#yes, very high correlation - use total score for short BioVEDA (14 items) to increase sample size

#Histogram of BioVEDA short scores (14 items)
hist(dataf$BV.short,breaks=c(0.5,1.5,2.5,3.5,4.5,5.5,6.5,7.5,8.5,9.5,10.5,11.5,12.5,13.5,14.5))
summary(dataf$BV.short)
#approximately a normal distribution
#range is 1-14 out of 14 possible; most students score between 5-9

#Mean and SD of BioVEDA short items
mean(dataf$BV.short,na.rm=T)
sd(dataf$BV.short,na.rm=T)



####### Create subset containing complete data for MLR #############

#Data set with complete values for short BV only
dataf.4<-subset(dataf,!is.na(BV.short))
#n = 263




############  Sensitivity Analysis: Re-run CFA with subsetted data  ##########

subset.model<-'
Interest = ~I1_1 + I1_2 + I1_3 + I1_4
Attain1 =~ A1_1 + A1_2 + A1_3 + A1_4
Attain2 =~ A2_1 + A2_2.rev + A2_3 + A2_4 + A2_5.rev + A2_6
Utility1 = ~U1_1 + U1_2
Utility2 = ~U2_1 + U2_2 + U2_3
Utility3 =~ U3_1 + U3_2 + U3_3
Utility4 = ~U4_1 + U4_2
Cost1 = ~C1_1 + C1_2 + C1_3 + C1_4
Cost2 = ~C2_1 + C2_2 + C2_3 + C2_4
Cost3 =~ C3_1 + C3_2 + C3_3

#Correlate residuals of negatively worded items
A2_2.rev~~A2_5.rev
'

fit.subset<-cfa(subset.model,data=dataf.4,estimator="MLR")
summary(fit.subset,fit.measures=T,standardized=T, rsquare=T)
#n=263






################  MLR  #########################

#Create composite scores for each task-value facet by taking the means of the items
dataf.4$Int<-rowMeans(dataf.4[,c("I1_1","I1_2","I1_3","I1_4")])
dataf.4$AttImpAch<-rowMeans(dataf.4[,c("A1_1","A1_2","A1_3","A1_4")])
dataf.4$AttPersImp<-rowMeans(dataf.4[,c("A2_1","A2_2.rev","A2_3","A2_4","A2_5.rev","A2_6")])
dataf.4$UtySch<-rowMeans(dataf.4[,c("U1_1","U1_2")])
dataf.4$UtyDL<-rowMeans(dataf.4[,c("U2_1","U2_2","U2_3")])
dataf.4$UtySoc<-rowMeans(dataf.4[,c("U3_1","U3_2","U3_3")])
dataf.4$UtyJob<-rowMeans(dataf.4[,c("U4_1","U4_2")])
dataf.4$CstEff<-rowMeans(dataf.4[,c("C1_1","C1_2","C1_3","C1_4")])
dataf.4$CstEmo<-rowMeans(dataf.4[,c("C2_1","C2_2","C2_3","C2_4")])
dataf.4$CstOpp<-rowMeans(dataf.4[,c("C3_1","C3_2","C3_3")])

#Check correlations of task-value facets to get a sense of whether multicollinearity might be an issue
#Also to see raw correlations between task values and BioVEDA scores
cor(cbind(dataf.4$Int,dataf.4$AttImpAch,dataf.4$AttPersImp,dataf.4$UtySch,dataf.4$UtyDL,
          dataf.4$UtySoc,dataf.4$UtyJob,dataf.4$CstEff,dataf.4$CstEmo,dataf.4$CstOpp,dataf.4$BV.short))
#only correlations above 0.7: Effort Cost & Emotional Cost



#Make semester a categorical variable with the first semester as the reference semester
dataf.4$semester<-relevel(factor(dataf.4$semester),ref="1")

#Fit a regression model
fit.glm.1<-lm(BV.short ~ Int + AttImpAch + AttPersImp + UtySch + UtyDL + UtySoc + UtyJob + CstEff + CstEmo + CstOpp + semester, data=dataf.4)
summary(fit.glm.1)
#overall regression model is significant (p < 0.001) with adjusted R2 of 0.127
#of the task values, only Emotional cost is significant

#Check assumptions of MLR
plot(fit.glm.1)
#it actually looks good...
check_model(fit.glm.1)
#looks good using this package, too!

#Calculate the variance inflation factor to look for multicollinearity issues
fit.vif<-vif(fit.glm.1)
fit.vif
#square GVIF^(1/(2*df)) to get values analogous to traditional VIFs
sq.GVIF<-fit.vif[,3]^2
sq.GVIF





######### Model Selection of All Possible Nested Regression Models #########

#run overall model to see what comes out "best"
fit.glm.dredge<-lm(BV.short ~ Int + AttImpAch + AttPersImp + UtySch + UtyDL + UtySoc + UtyJob + CstEff + CstEmo + CstOpp + semester, data=dataf.4, na.action = "na.fail")
dd<-dredge(fit.glm.dredge,fixed="semester")
head(dd,n=10)

#Run regression of the "best" model
fit.glm.2<-lm(BV.short ~ AttImpAch + CstEmo + semester, data=dataf.4)
summary(fit.glm.2)

#Check assumptions of MLR
plot(fit.glm.2)
#it actually looks good...
check_model(fit.glm.2)
#looks good using this package, too!

#Calculate the variance inflation factor to look for multicollinearity issues
fit.vif.2<-vif(fit.glm.2)
fit.vif.2
#square GVIF^(1/(2*df)) to get values analogous to traditional VIFs
sq.GVIF.2<-fit.vif.2[,3]^2
sq.GVIF.2


####  Get AICc values to two decimal places for each of the best models

fit.AIC.1<-lm(BV.short ~ AttImpAch + CstEmo + semester, data=dataf.4)
fit.AIC.2<-lm(BV.short ~ AttImpAch + AttPersImp + CstEmo + semester, data=dataf.4)
fit.AIC.3<-lm(BV.short ~ AttImpAch + CstEmo + CstEff + semester, data=dataf.4)
fit.AIC.4<-lm(BV.short ~ AttImpAch + CstEmo + UtyJob + semester, data=dataf.4)
fit.AIC.5<-lm(BV.short ~ AttImpAch + CstEmo + UtyDL + semester, data=dataf.4)
fit.AIC.6<-lm(BV.short ~ AttImpAch + CstEmo + UtySch + semester, data=dataf.4)

AICc(fit.AIC.1,fit.AIC.2,fit.AIC.3,fit.AIC.4,fit.AIC.5,fit.AIC.6)
